#!pip install -r requirements.txt
%load_ext autoreload
%autoreload 2
# Widen the notebook container to the full browser width.
# `display` is imported from the public IPython.display module (the same
# module this notebook imports Image from): importing it from
# IPython.core.display has been deprecated since IPython 7.14 and raises
# ImportError on recent IPython releases.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
import warnings
warnings.filterwarnings('ignore')
import os
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
pd.options.plotting.backend = "plotly"
import seaborn as sns
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (20,10)
import plotly_express as px
import plotly.figure_factory as ff
from IPython.display import Image
from src.wk_means import WKMeans
from src.clustering_utils import *
In this notebook, the goal is to apply Wasserstein k-means market-regime clustering to S&P 500 (SPY) data.
# Load the 'Adj Close' column of the SPY CSV as a Series named 'SPY',
# indexed by the file's first column parsed as dates.
index_data = (
    pd.read_csv('data/SPY.csv', index_col=0, parse_dates=True)
    .loc[:, 'Adj Close']
    .rename('SPY')
)

# Raw price level over time (the pandas plotting backend is set to plotly
# above, so .plot() returns a figure that can be customised and shown).
fig = index_data.plot()
fig.update_layout(xaxis_title='date', yaxis_title='raw')
fig.show()

# Log returns: first difference of the log price.
fig = np.log(index_data).diff().plot()
fig.update_layout(xaxis_title='date', yaxis_title='log returns')
fig.show()
Next, we generate our ground-truth data, which will serve as a benchmark.
Each observation represents a distribution. An empirical measure of a distribution is defined as: $$ \mu^{r}((-\infty, x])=\frac{1}{N} \sum_{i=1}^{N} \chi_{\left\{Q^{i}(r) \leq x\right\}}(x), $$
where $\chi: \mathbb{R} \rightarrow[0,1]$ is the indicator function and $Q^{i}: \mathcal{S}(\mathbb{R}) \rightarrow \mathbb{R}$ is the $i$-th order statistic, for $i = 1, \dots, N$.
The Wasserstein distance between two empirical measures is defined as :
$$ \mathcal{W}_{p}(\mu, \nu):=\min _{\mathbb{P} \in \Pi(\mu, \nu)}\left\{\int_{X \times X} d(x, y)^{p} \, \mathbb{P}(d x, d y)\right\}^{1/p}, $$ where $\Pi(\mu, \nu):=\{\mathbb{P} \in \mathcal{P}(X \times X): \mathbb{P}(A \times X)=\mu(A), \quad \mathbb{P}(X \times B)=\nu(B)\}$ is the set of transport plans between $\mu$ and $\nu$,
and $$ d(x,y) = \|x-y\|_X. $$
You can see it as the minimum "cost" of turning one distribution into the other.
However, we can compute the Wasserstein distance between two empirical measures with the same number of atoms using this simplification: $$ \mathcal{W}_{p}(\mu, \nu)^{p}=\frac{1}{N} \sum_{i=1}^{N}\left|\alpha_{i}-\beta_{i}\right|^{p}, $$
where $\left(\alpha_{i}\right)_{1 \leq i \leq N}$ and $\left(\beta_{i}\right)_{1 \leq i \leq N}$ are increasing sequences corresponding to the atoms of $\mu$ and $\nu$ (cf. $\frac{1}{N} \sum_{i=1}^{N} \chi_{\left\{Q^{i}(r) \leq x\right\}}(x)$).
When $p=1$, the Wasserstein distance is equivalent to the Earth Mover's Distance.
# Render the image file src/EMD.jpg as the cell output (presumably an
# Earth Mover's Distance illustration, going by the file name).
Image(filename='src/EMD.jpg')
# Window parameters handed to partition(). The author's notes label h1 as the
# "stride" (20 days) and h2 as the "step" -- TODO confirm the exact meaning
# against partition()'s definition.
h1 = 20
h2 = 1

# Log returns of the index: interior gaps are forward-filled, then the
# leading NaN left by diff() is dropped.
stream = np.log(index_data).diff().ffill().dropna()

# partition() is a project helper (presumably from the src.clustering_utils
# star import); with return_lift=True it returns a pair, unpacked here.
features_wk_means, lift = partition(stream, h1, h2, return_lift=True)
features_wk_means
Now we will initialize and fit a WK means model :
# Build a two-cluster WKMeans estimator with a fixed random_state, then fit it
# on the features produced by partition().
estimator_wk_means = WKMeans(n_clusters=2, random_state=2022)
estimator_wk_means.fit(features_wk_means)
# create_labels and plot_regime_time_series are project helpers; presumably
# the first maps the fitted clusters back to labels and the second overlays
# those labels on the price series -- verify against their definitions.
labels_wk_means = create_labels(estimator_wk_means, features_wk_means)
fig = plot_regime_time_series(labels_wk_means, index_data, return_fig=True)
fig.show()
To get another view of the clustering, we plot each set of returns in mean/volatility space:
# Per the author's notes, each centroid is the Wasserstein barycenter of its
# cluster. The mean and standard deviation of every barycenter (taken along
# axis 1 of the centroid array) are passed to the plot alongside the centroids.
_centroid_matrix = np.array(estimator_wk_means.cluster_centers_)
mean_volatility_plot(
    lift,
    labels_wk_means,
    estimator_wk_means.cluster_centers_,
    _centroid_matrix.mean(axis=1),
    _centroid_matrix.std(axis=1),
)